# Read the transformed UFO sightings CSV into a data frame
file_path <- "ufo-sightings-transformed.csv"
ufo_dataset <- utils::read.csv(file = file_path, header = TRUE)
# Overview of the dataset
head(ufo_dataset)
X Date_time date_documented Year Month Hour Season Country_Code
1 0 1949-10-10 20:30:00 4/27/2004 1949 10 20 Autumn USA
2 1 1949-10-10 21:00:00 12/16/2005 1949 10 21 Autumn USA
3 2 1955-10-10 17:00:00 1/21/2008 1955 10 17 Autumn GBR
4 3 1956-10-10 21:00:00 1/17/2004 1956 10 21 Autumn USA
5 4 1960-10-10 20:00:00 1/22/2004 1960 10 20 Autumn USA
6 5 1961-10-10 19:00:00 4/27/2007 1961 10 19 Autumn USA
Country Region Locale latitude longitude UFO_shape
1 United States Texas San Marcos 29.88306 -97.941111 Cylinder
2 United States Texas Bexar County 29.38421 -98.581082 Light
3 United Kingdom England Chester 53.20000 -2.916667 Circle
4 United States Texas Edna 28.97833 -96.645833 Circle
5 United States Hawaii Kaneohe 21.41806 -157.803611 Light
6 United States Tennessee Bristol 36.59500 -82.188889 Sphere
length_of_encounter_seconds Encounter_Duration
1 2700 45 minutes
2 7200 1-2 hrs
3 20 20 seconds
4 20 1/2 hour
5 900 15 minutes
6 300 5 minutes
Description
1 This event took place in early fall around 1949-50. It occurred after a Boy Scout meeting in the Baptist Church. The Baptist Church sit
2 1949 Lackland AFB, TX. Lights racing across the sky & making 90 degree turns on a dime.
3 Green/Orange circular disc over Chester, England
4 My older brother and twin sister were leaving the only Edna theater at about 9 PM,...we had our bikes and I took a different route home
5 AS a Marine 1st Lt. flying an FJ4B fighter/attack aircraft on a solo night exercise, I was at 50ꯠ' in a "clean" aircraft (no ordinan
6 My father is now 89 my brother 52 the girl with us now 51 myself 49 and the other fellow which worked with my father if he's still livi
summary(ufo_dataset)
X Date_time date_documented Year
Min. : 0 Length:80328 Length:80328 Min. :1906
1st Qu.:20082 Class :character Class :character 1st Qu.:2001
Median :40164 Mode :character Mode :character Median :2006
Mean :40164 Mean :2004
3rd Qu.:60245 3rd Qu.:2011
Max. :80327 Max. :2014
Month Hour Season Country_Code
Min. : 1.000 Min. : 0.00 Length:80328 Length:80328
1st Qu.: 4.000 1st Qu.:10.00 Class :character Class :character
Median : 7.000 Median :19.00 Mode :character Mode :character
Mean : 6.835 Mean :15.53
3rd Qu.: 9.000 3rd Qu.:21.00
Max. :12.000 Max. :23.00
Country Region Locale latitude
Length:80328 Length:80328 Length:80328 Min. :-82.86
Class :character Class :character Class :character 1st Qu.: 34.13
Mode :character Mode :character Mode :character Median : 39.41
Mean : 38.12
3rd Qu.: 42.79
Max. : 72.70
longitude UFO_shape length_of_encounter_seconds
Min. :-176.66 Length:80328 Min. : 0
1st Qu.:-112.07 Class :character 1st Qu.: 30
Median : -87.90 Mode :character Median : 180
Mean : -86.77 Mean : 9017
3rd Qu.: -78.75 3rd Qu.: 600
Max. : 178.44 Max. :97836000
Encounter_Duration Description
Length:80328 Length:80328
Class :character Class :character
Mode :character Mode :character
str(ufo_dataset)
'data.frame': 80328 obs. of 17 variables:
$ X : int 0 1 2 3 4 5 6 7 8 9 ...
$ Date_time : chr "1949-10-10 20:30:00" "1949-10-10 21:00:00" "1955-10-10 17:00:00" "1956-10-10 21:00:00" ...
$ date_documented : chr "4/27/2004" "12/16/2005" "1/21/2008" "1/17/2004" ...
$ Year : int 1949 1949 1955 1956 1960 1961 1965 1965 1966 1966 ...
$ Month : int 10 10 10 10 10 10 10 10 10 10 ...
$ Hour : int 20 21 17 21 20 19 21 23 20 21 ...
$ Season : chr "Autumn" "Autumn" "Autumn" "Autumn" ...
$ Country_Code : chr "USA" "USA" "GBR" "USA" ...
$ Country : chr "United States" "United States" "United Kingdom" "United States" ...
$ Region : chr "Texas" "Texas" "England" "Texas" ...
$ Locale : chr "San Marcos" "Bexar County" "Chester" "Edna" ...
$ latitude : num 29.9 29.4 53.2 29 21.4 ...
$ longitude : num -97.94 -98.58 -2.92 -96.65 -157.8 ...
$ UFO_shape : chr "Cylinder" "Light" "Circle" "Circle" ...
$ length_of_encounter_seconds: num 2700 7200 20 20 900 300 180 1200 180 120 ...
$ Encounter_Duration : chr "45 minutes" "1-2 hrs" "20 seconds" "1/2 hour" ...
$ Description : chr "This event took place in early fall around 1949-50. It occurred after a Boy Scout meeting in the Baptist Church"| __truncated__ "1949 Lackland AFB, TX. Lights racing across the sky & making 90 degree turns on a dime." "Green/Orange circular disc over Chester, England" "My older brother and twin sister were leaving the only Edna theater at about 9 PM,...we had our bikes and I "| __truncated__ ...
colSums(ufo_dataset == "" | is.na(ufo_dataset))
X Date_time
0 0
date_documented Year
0 0
Month Hour
0 0
Season Country_Code
0 259
Country Region
259 566
Locale latitude
457 0
longitude UFO_shape
0 1930
length_of_encounter_seconds Encounter_Duration
0 0
Description
15
# Disabled alternative: complete.cases() drops rows containing NA only;
# it does not detect blank ("") strings.
# ufo_dataset <- ufo_dataset[complete.cases(ufo_dataset), ]
# Remove rows with missing or blank values.
# Build one logical matrix that is TRUE wherever a cell is "" or NA, then
# keep only the rows that contain no TRUE. A cell that is NA yields
# NA | TRUE, which is TRUE, so the mask itself never contains NA.
# This replaces a row-wise apply(), which coerces the whole data frame to a
# character matrix and calls an R function once per row.
is_blank_or_na <- ufo_dataset == "" | is.na(ufo_dataset)
ufo_dataset <- ufo_dataset[rowSums(is_blank_or_na) == 0, , drop = FALSE]
library(lubridate)
Attaching package: 'lubridate'
The following objects are masked from 'package:base':
date, intersect, setdiff, union
# Parse the sighting timestamp with lubridate's year-month-day h:m:s parser
ufo_dataset[["Date_time"]] <- ymd_hms(ufo_dataset[["Date_time"]])
# Parse the documentation date with lubridate's month-day-year parser
ufo_dataset[["date_documented"]] <- mdy(ufo_dataset[["date_documented"]])
# Turn the categorical text columns into factors in one pass
factor_cols <- c(
  "Season", "Country_Code", "Country", "Region", "Locale", "UFO_shape"
)
ufo_dataset[factor_cols] <- lapply(ufo_dataset[factor_cols], as.factor)